The current experiment is a cell multiplex experiment involving EpCam+ sorted mouse Thymic Epithelial Fibroblast, where 6 samples were hash tagged using Totalseq A HashTag antibodies and combined together for a single run in 10x chromium. The GEX was preprocessed with cellranger 8.0 and the CiteSeq data was preprocessed using Cite-Seq-Count. Both analysis yielded matrix files that is being used to create a Seurat object.
library(dplyr)
library(Seurat)
library(sctransform)
library(patchwork)
library(ggplot2)
library(glmGamPoi)
library(lme4)
library(magrittr)
library(scCustomize)
library(presto)
library(edgeR)
library(matrixStats)
library(tibble)
library(ggrepel)
lapply(c("HGNChelper","openxlsx"), library, character.only = T)
## [[1]]
## [1] "HGNChelper" "ggrepel" "tibble" "matrixStats" "edgeR"
## [6] "limma" "presto" "data.table" "Rcpp" "scCustomize"
## [11] "magrittr" "lme4" "Matrix" "glmGamPoi" "ggplot2"
## [16] "patchwork" "sctransform" "Seurat" "SeuratObject" "sp"
## [21] "dplyr" "stats" "graphics" "grDevices" "utils"
## [26] "datasets" "methods" "base"
##
## [[2]]
## [1] "openxlsx" "HGNChelper" "ggrepel" "tibble" "matrixStats"
## [6] "edgeR" "limma" "presto" "data.table" "Rcpp"
## [11] "scCustomize" "magrittr" "lme4" "Matrix" "glmGamPoi"
## [16] "ggplot2" "patchwork" "sctransform" "Seurat" "SeuratObject"
## [21] "sp" "dplyr" "stats" "graphics" "grDevices"
## [26] "utils" "datasets" "methods" "base"
library(SCpubr)
library(decoupleR)
library(ReactomeGSA)
gex_data<-Read10X("filtered_feature_bc_matrix/")
hto_data<-Read10X("umi_count/", gene.column=1)
hto_data <- hto_data[-c(7),]
current_col_names <- colnames(hto_data)
new_col_names <- paste0(current_col_names, "-1")
colnames(hto_data) <- new_col_names
current_row_names <- rownames(hto_data)
new_row_names <- gsub("-.*", "", current_row_names)
rownames(hto_data) <- new_row_names
dim(hto_data)
## [1] 6 9116
dim(gex_data)
## [1] 33696 9116
joint.bcs <- intersect(colnames(gex_data), colnames(hto_data))
gex_data <- gex_data[, joint.bcs]
hto_data <- as.matrix(hto_data[, joint.bcs])
# remove the variable unmapped from HTO object
dim(hto_data)
## [1] 6 8856
dim(gex_data)
## [1] 33696 8856
Creating a seurat object with the gene expression data and adding the HTO data as a seperate assay
data <- CreateSeuratObject(counts = gex_data,
project = "Jason data", #Name this whatever.
names.delim = NULL) # Don't try and parse the sample names
HTO <- CreateAssay5Object(counts = hto_data)
## Warning: Data is of class matrix. Coercing to dgCMatrix.
data[["HTO"]] <- HTO
#Assess percent of mitochondrial counts in each cell
data[["percent_mt"]] <- PercentageFeatureSet(object = data,
pattern = "^mt-")
#Violin plot
VlnPlot(object = data,
features = c("nFeature_RNA",
"nCount_RNA",
"percent_mt"),
ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
#Other plotting otions
plot1 <- FeatureScatter(object = data,
feature1 = "nCount_RNA",
feature2 = "percent_mt")
plot2 <- FeatureScatter(object = data,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
plot1 + plot2
#Subset data
data <- subset(x = data,
subset = nFeature_RNA > 200 &
nCount_RNA > 5000 &
percent_mt < 15)
VlnPlot(object = data,
features = c("nFeature_RNA",
"nCount_RNA",
"percent_mt"),
ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
options(future.globals.maxSize = 1024^3 * 4) # Set to 4 GiB
data <- SCTransform(data, method= "glmGamPoi", verbose = FALSE)
data <- NormalizeData(data, assay = "HTO", normalization.method = "CLR")
## Normalizing layer: counts
## Normalizing across features
data <- HTODemux(data, assay = "HTO", positive.quantile = 0.99)
## As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.
## This message is displayed once per session.
## First group.by variable `ident` starts with a number, appending `g` to ensure valid variable names
## Cutoff for KO1 : 15 reads
##
## Cutoff for KO2 : 11 reads
##
## Cutoff for WT1 : 51 reads
##
## Cutoff for KO3 : 12 reads
##
## Cutoff for WT2 : 31 reads
##
## Cutoff for WT3 : 95 reads
##
## This message is displayed once every 8 hours.
table(data$HTO_classification.global)
##
## Doublet Negative Singlet
## 853 4 6688
Idents(data) <- "HTO_classification.global"
VlnPlot(data, features = "nCount_RNA", pt.size = 0.1, log = TRUE)
##Visualize HTO data
DefaultAssay(data) <- "HTO"
data <- ScaleData(data, features = rownames(data),
verbose = FALSE)
data <- RunPCA(data, features = rownames(data), approx = FALSE)
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## PC_ 1
## Positive: KO1, KO2, WT1
## Negative: WT3, WT2, KO3
## PC_ 2
## Positive: KO3, WT2, WT1
## Negative: WT3, KO1, KO2
## PC_ 3
## Positive: WT2, KO2, KO1
## Negative: WT1, KO3, WT3
## PC_ 4
## Positive: WT1, WT2, KO1
## Negative: KO3, WT3, KO2
## PC_ 5
## Positive: KO2, WT1, KO3
## Negative: KO1, WT2, WT3
datat <- RunTSNE(data, dims = 1:8, perplexity = 100)
## Warning in `[[.DimReduc`(args$object, cells, args$dims): The following
## embeddings are not present: NA, NA
DimPlot(data)
##Extract singlet data and perform further processing
singlets <- subset(data, idents = "Singlet")
DefaultAssay(singlets) <- "SCT"
# Use the highly variable genes to find principal components
singlets <- RunPCA(object = singlets, verbose = FALSE)
## Warning: Number of dimensions changing from 6 to 50
#Examine and visualize PCA results a few different ways
print(x = singlets[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: Ctsl, Prss16, Ccl25, Tbata, Cdh4
## Negative: Nrg1, Syt1, Srgn, Dpp10, Cd52
## PC_ 2
## Positive: Ccl25, Tbata, Prss16, Fabp5, Fgf14
## Negative: Ccl21a, Kcnq3, Gm36660, Csmd1, Krt5
## PC_ 3
## Positive: Grid1, Dpp10, Top2a, Hmgb2, H2ac24
## Negative: AW112010, Fam3d, Atp1b1, Reg3g, Ly6a
## PC_ 4
## Positive: Ccl21a, Lifr, S100a8, Nrg1, Kcnq3
## Negative: Top2a, Hmgb2, H2ac24, H1f5, Mki67
## PC_ 5
## Positive: Gpc5, Iigp1, S100a8, Top2a, Arhgap6
## Negative: Ccl25, Tbata, Wfdc18, Ccl21a, Syt1
DimPlot(singlets, reduction = "pca")
ElbowPlot(singlets)
VizDimLoadings(object = singlets, dims = 1:2, reduction = "pca")
DimPlot(singlets, reduction = "pca", group.by = "HTO_classification")
singlets <- RunUMAP(singlets, dims = 1:10, verbose = FALSE)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
DimPlot(singlets, reduction = "umap")
DimPlot(singlets, reduction = "umap", group.by = "HTO_classification")
p1 <- SCpubr::do_DimPlot(sample = singlets,
label = FALSE,
group.by = "HTO_classification",
raster = FALSE,
repel=TRUE, legend.position = "bottom")
singlets <- FindNeighbors(singlets, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
singlets <- FindClusters(singlets, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6688
## Number of edges: 209140
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8921
## Number of communities: 11
## Elapsed time: 0 seconds
DimPlot(singlets, reduction = "umap", label = TRUE)
DimPlot(singlets, reduction = "pca", label = TRUE)
p2 <- SCpubr::do_DimPlot(sample = singlets,
label = FALSE,
raster = FALSE,
repel=TRUE, legend.position = "bottom")
##Cell Annotation
# find markers for every cluster compared to all remaining cells, report only the positive ones
markers <- FindAllMarkers(singlets,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
markers %>%
group_by(., cluster) %>%
top_n(., n = 10, wt = avg_log2FC) -> top10
DoHeatmap(singlets, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(singlets, features = top10$gene): The following features
## were omitted as they were not found in the scale.data slot for the SCT assay:
## Gm6961, Dgkeos, Trdc, Rag1, Ptprcap, Sh2d1a, Cd8b1, Cd8a, Col6a2, Gfra2, C1s1,
## Tnfsf10, Sult5a1, Pif1, H2bc14, H3c2, H2ac10, H2ac4, Gm13944, Gm15261, Etv5,
## Slc38a4, Scn1a, Ackr4, Adhfe1, Klf15, Tepp, Col8a2, Gm15398, 1700024B18Rik,
## Frem2, Scx, D630003M21Rik, Kcna2, Wnt10b, Tvp23a, Cdh22, A630023P12Rik,
## Gm29481, Dusp28
sample_names <- gsub("[0-9]", "", singlets$HTO_classification)
phenotype <- ifelse(grepl("WT", sample_names), "WT", "KO")
singlets$phenotype <- factor(phenotype)
DimPlot_scCustom(singlets, reduction = "umap", split.by = "phenotype")&theme_bw() & theme()
##
## NOTE: DimPlot_scCustom returns split plots as layout of all plots each
## with their own axes as opposed to Seurat which returns with shared x or y axis.
## To return to Seurat behvaior set `split_seurat = TRUE`.
##
## -----This message will be shown once per session.-----
p3 <- SCpubr::do_DimPlot(sample = singlets,
label = FALSE,
split.by = "phenotype",
raster = FALSE,
repel=TRUE, legend.position = "bottom")
genes_list_1 <- c("Psmb11", "Dll4", "Ccl25", "Enpep", "Ly75", "Cd83")
genes_list_2 <- c("Prss16", "Ackr4", "Aire", "Fezf2", "Cd80", "Krt5")
genes_list_3 <- c("Itga6", "Ccl21a", "Ccl19", "Irga2", "Krt10", "Pigr")
genes_list_4 <- c("H2-Aa", "Cldn3", "Cldn4", "Tnfrsf11a", "Epcam")
FeaturePlot_scCustom(singlets, genes_list_1 ) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
##
## NOTE: FeaturePlot_scCustom uses a specified `na_cutoff` when plotting to
## color cells with no expression as background color separate from color scale.
## Please ensure `na_cutoff` value is appropriate for feature being plotted.
## Default setting is appropriate for use when plotting from 'RNA' assay.
## When `na_cutoff` not appropriate (e.g., module scores) set to NULL to
## plot all cells in gradient color palette.
##
## -----This message will be shown once per session.-----
FeaturePlot_scCustom(singlets, genes_list_2 ) + plot_annotation(title = "Gene list 2", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
FeaturePlot_scCustom(singlets, genes_list_3 ) + plot_annotation(title = "Gene list 3", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
## Warning: The following features were omitted as they were not found:
## ℹ Irga2
FeaturePlot_scCustom(singlets, genes_list_4 ) + plot_annotation(title = "Gene list 4", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
Previously Campinoti et al., 2020 published a detailed cellular overview of the human postnatal thymus. We took the gene signatures corresponding to mTEC, cTEC and comTEC and applied to our dataset to classify TECs.
early_progenitor_gene_list <- list(c("Ackr4", "Adamts10", "Agrn", "Aldh2", "Aldh6a1", "Amotl1", "Amotl2", "Antxr1", "Apoe",
"Ar", "Bcam", "Bcl11a", "Bcl2", "Bmp4", "Btg2", "Cbx6", "Ccdc80", "Cdh11", "Cldn8",
"Clec11a", "Clstn1", "Col18a1", "Cpne8", "Cthrc1", "Cyp1b1", "Dcn", "Ddr1", "Dhrs3",
"Dlk2", "Dnajc13", "Dpp6", "Dpysl2", "Dsc3", "Egr1", "Eid1",
"Fkbp9", "Fmod", "Fos", "Fosb", "Frmd6", "Fstl1", "Ogn", "Gas1", "Pak3", "Gbp2",
"Palld", "Gnaq", "Pdpn", "Gpm6b", "Penk", "Gprasp1", "Plxdc2", "Gstm2", "Pmp22",
"H2-DMa", "Prelp", "Hes6", "Prrg3", "Hic1", "Prss23", "Hsd17b10", "Ptprz1", "Igfbp2",
"Igfbp3", "Pygb", "Igfbp5", "Rbp1", "Igfbp7", "Rnase4", "Iigp1", "Scn1a",
"Il33", "Serpinf1", "Irgm1", "Serpinh1", "Isl1", "Shisa2", "Itm2c", "Slc2a13", "Kazald1",
"Sord", "Lamb1", "Sparc", "Laptm4a", "Spon2", "Limch1", "Spry1", "Ltbp3", "Tcn2",
"Maged1", "Tgfbr2", "Megf6", "Tgfbr3", "Meis1", "Thbd", "Mgll", "Thbs1", "Mgp",
"Timp2", "Myl9", "Tinagl1", "Mylk", "Tnfrsf19", "Nbl1", "Tns1", "Nell2", "Tns3",
"Nfia", "Trim29", "Nfib", "Trp63", "Nfix", "Tspan9", "Nr2f1", "Twsg1", "Nr4a1",
"Txnip", "Nrtn", "Unc119", "Ntrk3", "Vmac", "Oat", "Wls", "Wscd1", "Xdh", "Zfp36"))
postnatal_progenitors_gene_list <- list(c("Acta2", "Apoe", "Ascl1", "Boc", "C1s1", "C3", "Cald1", "Ccl11", "Ccl21a", "Clca3a1",
"Col6a1", "Col6a2", "Ddx60", "Dpysl3", "Dst", "Emp2", "Flna",
"Fst", "Fzd2", "Gas1", "Glul", "Gpx3", "Gsn", "Hpgd", "Htra1", "Id1", "Ifi27l2a",
"Igfbp4", "Igfbp5", "Irf7", "Isg15", "Itga6", "Itgb4", "Krt14", "Krt5", "Krt7",
"Lamb3", "Lars2", "Lifr", "Mgp", "Myl9", "Nrbp2", "S1pr3", "Slc4a11", "Sox4", "Stat2",
"Sult5a1", "Tagln", "Tgfbi", "Tpm2", "Wfikkn2"))
cTEC_gene_list <- list(c("Ccl25", "Cd274", "Cfc1", "Ctsh", "Foxn1", "Kcnip3", "Ly75", "Prss16", "Psmb11", "Scx", "Slc46a2", "Tbx1"))
mTEC_gene_list <- list(c("AA467197", "Adap1", "Adgb", "Aebp2", "Ahcyl2", "Aif1l", "Aire", "Alas1", "Ank", "Ankrd33b",
"Anxa10", "Aoc1", "Apoa1", "Apoa4", "Apobec1", "Apoc2", "Aqp3", "Arc", "Asprv1", "Atp1a2",
"Atp1b1", "Atp6v1c1", "AU040320", "Avil", "AW112010", "Bcl2a1a", "Bcl2l1", "Blnk",
"Bmp2k", "Cadps", "Calca", "Calcb", "Casp1", "Casz1", "Ccdc184", "Ccdc88a", "Ccl17", "Ccl20",
"Ccl22", "Ccl27a", "Ccl5", "Ccl6", "Ccl9", "Ccr7", "Cd52", "Cd70", "Cdh17", "Cdhr5", "Cdkn1a",
"Cdkn1c", "Cdkn2b", "Cdx1", "Chil1", "Ckmt1", "Cldn13", "Cldn3", "Cldn4", "Cldn7", "Cnp",
"Cpeb4", "Crabp1", "Crhbp", "Csn2", "Cst6", "Ctrb1", "Ctsh", "Ctss", "Ctsz", "Cxcl2", "Cyba",
"Cybb", "Cyp2a4", "Cystm1", "Defb6", "Dgat2", "Dio1", "Dmkn", "Dmpk", "Dnase1l3",
"Dscaml1", "Eaf2", "Ebi3", "Ehd1", "Elf3", "Eno2", "Espn", "Etv3", "Fabp4", "Fabp6",
"Fabp9", "Fam25c", "Fam89a", "Fcer1g", "Fezf2", "Fgf21", "Fhad1", "Fnbp1", "Fscn1", "Gas7",
"Gbp4", "Gda", "Gdf15", "Gjb2", "Gnat3", "Gnb3", "Gnb4", "Gng13", "Gpa33", "Gramd4", "Grap",
"Grin2c", "Gstm1", "Gstt1", "Guca2b", "H2-Eb2", "H2-Oa", "Hagh", "Hal", "Hamp", "Hdc", "Hopx",
"Icosl", "Igf1", "Igfbp6", "Igsf8", "Il12a", "Il13", "Il1rn", "Il23a", "Il2rg", "Ing1", "Inpp5b",
"Insm1", "Itgb8", "Kcnk6", "Kctd12", "Klk1", "Klk1b11", "Klk7", "Krt10", "Krt16", "Krt20", "Krt77",
"Krt79", "Lad1", "Laptm5", "Lcp1", "Liph", "Lrrc42", "Lsp1", "Ly6i", "Lypd2", "Lypd8", "Lyz1",
"Malat1", "Mctp1", "Mdm2", "Me1", "Mep1a", "Mmp7", "Muc1", "Muc13", "Mxd1",
"Myl7", "Myo15b", "N4bp2l1", "Ncf1", "Ncf4", "Neat1", "Nfkbia", "Nos2", "Nostrin", "Nptx2", "Nsmce1",
"Nsmce2", "Nts", "Nup85", "Oasl1", "Ogfrl1", "Ooep", "Pcp4", "Pgc", "Pglyrp1", "Pglyrp2",
"Pigr", "Pip5k1b", "Pla1a", "Pla2g4a", "Plagl1", "Plb1", "Pld1", "Plekha4", "Pmaip1", "Prap1", "Prg2",
"Prg4", "Prokr2", "Psors1c2", "Ptgds", "Ptgs2", "Pyy", "Rab25", "Rac2", "Rap1gap", "Rarres1", "Reg3g",
"Resp18", "Rnasel", "Rnf128", "Rogdi", "S100a14", "S100a4", "S100a8", "S100a9", "S100g", "Saa3", "Sat1",
"Sel1l3", "Sema4a", "Serpinb12", "Serpinb1a", "Serpinb2", "Serpinb9", "Sgpp1", "Sh2d6", "Sh3tc2",
"Sirt1", "Skint2", "Skint4", "Skint9", "Slc13a2", "Slc43a3", "Wfdc21", "Slc4a8", "Slc5a8", "Slc6a20a",
"Slco5a1", "Smtnl1", "Sncg", "Spink1", "Spink5", "Spock3"))
singlets <- AddModuleScore(object = singlets, features = early_progenitor_gene_list, name = "earlypro_score")
singlets <- AddModuleScore(object = singlets, features = postnatal_progenitors_gene_list, name = "postnatal_score")
singlets <- AddModuleScore(object = singlets, features = mTEC_gene_list, name = "mTEC_score")
singlets <- AddModuleScore(object = singlets, features = cTEC_gene_list, name = "cTEC_score")
plot1<-FeaturePlot_scCustom(singlets, features="mTEC_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.5)&theme_bw() & theme()& labs(title = "mTEC signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
## features.
plot2<-FeaturePlot_scCustom(singlets, features="cTEC_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.5)&theme_bw() & theme()& labs(title = "cTEC signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
## features.
plot3<-FeaturePlot_scCustom(singlets, features="earlypro_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.5)&theme_bw() & theme()& labs(title = "Early Progenitor signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
## features.
plot4<-FeaturePlot_scCustom(singlets, features="postnatal_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.5)&theme_bw() & theme()& labs(title = "Postnatal Progenitor signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
## features.
plot1|plot2|plot3|plot4
#Classifying cluster 3 and 7
cluster3.markers <- FindMarkers(singlets,
ident.1 = 3,
min.pct = 0.25)
head(cluster3.markers, n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Tmem158 0.000000e+00 2.815598 0.921 0.333 0.000000e+00
## Utf1 1.027280e-300 2.533964 0.888 0.274 2.857174e-296
## Icosl 9.957734e-300 2.349307 0.918 0.284 2.769545e-295
## Cdx1 2.345691e-287 2.443963 0.884 0.278 6.524070e-283
## Ikzf1 1.270043e-268 2.340663 0.918 0.299 3.532370e-264
## Cdh22 1.274582e-264 3.156383 0.598 0.104 3.544995e-260
## Hnf1a 6.090882e-259 2.521828 0.758 0.196 1.694057e-254
## Slc4a8 7.881342e-246 2.591595 0.766 0.211 2.192038e-241
## Snx8 3.261791e-240 2.073517 0.895 0.341 9.072020e-236
## Tvp23a 2.511083e-237 2.621406 0.696 0.174 6.984076e-233
cluster7.markers <- FindMarkers(singlets,
ident.1 = 7,
min.pct = 0.25)
head(cluster7.markers, n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Kif11 0 4.088734 0.959 0.087 0
## Nusap1 0 4.586684 0.952 0.083 0
## Pclaf 0 4.258100 0.961 0.107 0
## Cdca8 0 4.049844 0.937 0.094 0
## Birc5 0 4.042079 0.931 0.088 0
## Knl1 0 4.326299 0.905 0.066 0
## Kif4 0 3.831519 0.931 0.102 0
## Cdk1 0 4.259671 0.944 0.115 0
## Top2a 0 4.740669 0.998 0.170 0
## Kif15 0 4.161737 0.883 0.063 0
##Finalized cell annotation From the above classifications there are the inferred cell annotation Cluster 0 - mTEC3 Cluster 1 - mTEC2 Cluster 2 - mature cTEC Cluster 3 - mTEC1 Cluster 4 - Postnatal progenitor cells Cluster 5 - immature cTEC Cluster 6 - Early progenitor cells Cluster 7 - Transient amplifying progenitor Cluster 8 - Postnatal progenitor cells Cluster 9 - Residual Cluster 10 - proliferating cTEC
cell_annotation <- data.frame(cluster=0:10,
cell_type=c("mTEC3", "mTEC2", "mature cTEC", "mTEC1", "PNP", "immature cTEC", "Early progenitors", "Transient amplifying progenitors", "PNP", "Residual", "proliferating cTEC"))
new.cluster.ids <- c("mTEC3", "mTEC2", "mature cTEC", "mTEC1", "PNP", "immature cTEC", "Early progenitors", "Transient amplifying progenitors", "PNP", "Residual", "proliferating cTEC")
names(new.cluster.ids) <- levels(singlets)
singlets <- RenameIdents(singlets, new.cluster.ids)
singlets$cell_type <- Idents(singlets)
DimPlot_scCustom(singlets, reduction = "umap", label = TRUE, repel = TRUE)&theme_bw() & theme()
p4 <- SCpubr::do_DimPlot(sample = singlets,
label = FALSE,
raster = FALSE,
repel=TRUE, legend.position = "right")
p4
p1|p2|p4
interferon_markers <- c("Mx1", "Isg15", "Oas1a", "Oas1b", "Oas1g", "Oas2", "Oas3",
"Rsad2", "Ifit1", "Ifit2", "Ifit3", "Ifi44", "Stat1",
"Stat2", "Irf7", "Cxcl10", "H2-Ab1", "Irf1",
"Cxcl9", "Ido1", "Icam1", "Ciita", "Eif2ak2", "Isg20",
"Zbp1", "Bst2")
# Define mouse interferon response markers for each type
interferonI_genes <- c("Mx1", "Isg15", "Oas1a", "Oas1b", "Oas1g", "Oas2", "Oas3", "Rsad2", "Ifit1", "Ifit2", "Ifit3", "Ifi44", "Stat1", "Stat2", "Irf7", "Cxcl10")
interferonII_genes <- c("H2-Ab1", "Irf1", "Cxcl9", "Cxcl10", "Ido1", "Icam1", "Ciita")
interferonIII_genes <- c("Il10rb", "Mx1", "Oas1a", "Oas1b", "Oas1g", "Oas2", "Oas3", "Ifit1", "Ifit2", "Ifit3")
singlets <- AddModuleScore(singlets, features = list(interferon_markers), name = "interferon_score")
singlets <- AddModuleScore(singlets, features = list(interferonI_genes), name = "interferon1_score")
singlets <- AddModuleScore(singlets, features = list(interferonII_genes), name = "interferon2_score")
singlets <- AddModuleScore(singlets, features = list(interferonIII_genes), name = "interferon3_score")
plot5<-FeaturePlot_scCustom(singlets, features="interferon_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.1)&theme_bw() & theme()& labs(title = "Interferon signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
## features.
plot6<-FeaturePlot_scCustom(singlets, features="interferon1_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.1)&theme_bw() & theme()& labs(title = "Type I Interferon signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
## features.
plot7<-FeaturePlot_scCustom(singlets, features="interferon2_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.1)&theme_bw() & theme()& labs(title = "Type II Interferon signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
## features.
plot8<-FeaturePlot_scCustom(singlets, features="interferon3_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.1)&theme_bw() & theme()& labs(title = "Type III Interferon signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
## features.
plot5|plot6|plot7|plot8
gene_list_5 <- c("Zfp36l1", "Zfp36l2", "Ets2", "Arnt2", "Tshz2", "Irx3")
gene_list_6 <- c("Klf12", "Aebp1", "Ebf1", "Mafa", "Nkx6-2", "Pou2af1")
gene_list_7 <- c("Scx", "Cdkn1a", "Cdkn1c", "Ccnd2", "Cables1", "Cdk14")
gene_list_8 <- c("Rspo1", "Sfrp1", "Shisa4", "Shisal1")
FeaturePlot_scCustom(singlets, gene_list_5) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
FeaturePlot_scCustom(singlets, gene_list_6) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
FeaturePlot_scCustom(singlets, gene_list_7) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
FeaturePlot_scCustom(singlets, gene_list_8) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
##Odds ratio analysis We are calculating the probability of a cell being in a cluster based on whether the cell is WT versus KO.
sample_pheno_data <- singlets@meta.data %>% select(HTO_classification, phenotype) %>% unique()
##the number of cells in each mouse in each cluster
Ncells <- table(singlets@meta.data$cell_type,
singlets@meta.data$HTO_classification) %>% as.data.frame()
colnames(Ncells) <- c("cluster_id", "HTO_classification", "Freq")
TotalCells <- Ncells %>%
group_by(HTO_classification) %>%
summarise(TotalCells = sum(Freq))
Ncells %<>% merge(., TotalCells)
Ncells %<>% merge(., sample_pheno_data)
Clusters <- unique(Ncells$cluster_id)
Nclusters <- length(Clusters)
##function to estimate the change in the odds of cluster membership from the 5 day to the 11 day time-point
estimateCellStateChange <- function(k, Ncells, TotalCells, SampleInfo) {
require(lme4)
require(gdata)
print(paste("Cluster", k))
Ncells_sub <- Ncells %>%
filter(cluster_id==k)
glmerFit <- glmer(cbind(Freq, (TotalCells - Freq)) ~ (1|HTO_classification) + phenotype, data=Ncells_sub, family = "binomial", control = glmerControl(optimizer = "bobyqa"))
sglmerFit <- summary(glmerFit)
TempRes <- (sglmerFit$coefficients[-1,])
return(TempRes)
}
ClusterRes <- sapply(Clusters, estimateCellStateChange, Ncells, TotalCells, SampleInfo)
## Loading required package: gdata
## Warning: package 'gdata' was built under R version 4.4.1
## Registered S3 method overwritten by 'gdata':
## method from
## reorder.factor gplots
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:data.table':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## combine, first, last, starts_with
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
## [1] "Cluster mTEC3"
## boundary (singular) fit: see help('isSingular')
## [1] "Cluster mTEC2"
## [1] "Cluster mature cTEC"
## boundary (singular) fit: see help('isSingular')
## [1] "Cluster mTEC1"
## [1] "Cluster PNP"
## [1] "Cluster immature cTEC"
## [1] "Cluster Early progenitors"
## [1] "Cluster Transient amplifying progenitors"
## [1] "Cluster Residual"
## [1] "Cluster proliferating cTEC"
## boundary (singular) fit: see help('isSingular')
ClusterRes %<>%
as.data.frame() %>%
t()
row.names(ClusterRes) <- paste0("Cluster", Clusters)
ClusterRes <- data.frame(ClusterRes)
colnames(ClusterRes)[c(1,4)] <- c("logOddsRatio_WT_vs_KO","pvalue")
##perform multiple-testing correction
ClusterRes %<>% mutate(p.adjust = p.adjust(pvalue, method = "BH"))
print(ClusterRes)
## logOddsRatio_WT_vs_KO Std..Error
## ClustermTEC3 -0.2821077 0.06454679
## ClustermTEC2 -0.4800539 0.09830424
## Clustermature cTEC 1.7507792 0.10025562
## ClustermTEC1 -0.7336324 0.10685314
## ClusterPNP -0.9135195 0.08201985
## Clusterimmature cTEC 1.4607543 0.23857779
## ClusterEarly progenitors 1.3344321 0.18246794
## ClusterTransient amplifying progenitors -0.6438046 0.13314435
## ClusterResidual -1.4030929 0.25594961
## Clusterproliferating cTEC 0.5908567 0.27612557
## z.value pvalue p.adjust
## ClustermTEC3 -4.370592 1.239102e-05 1.376780e-05
## ClustermTEC2 -4.883349 1.042991e-06 1.489987e-06
## Clustermature cTEC 17.463152 2.734109e-68 2.734109e-67
## ClustermTEC1 -6.865800 6.611947e-12 1.652987e-11
## ClusterPNP -11.137786 8.213591e-29 4.106796e-28
## Clusterimmature cTEC 6.122759 9.196872e-10 1.839374e-09
## ClusterEarly progenitors 7.313242 2.607733e-13 8.692444e-13
## ClusterTransient amplifying progenitors -4.835388 1.328863e-06 1.661078e-06
## ClusterResidual -5.481911 4.207568e-08 7.012614e-08
## Clusterproliferating cTEC 2.139812 3.236999e-02 3.236999e-02
ggplot(Ncells, aes(x=phenotype, y=(Freq+1)/TotalCells)) +
geom_boxplot() +
geom_point() +
scale_y_log10() +
ylab("proportion of cells") +
facet_grid(cols = vars(cluster_id)) + theme_bw()
singlets@meta.data$phenotype <- as.factor(singlets@meta.data$phenotype)
singlets@meta.data$HTO_classification <- as.factor(singlets@meta.data$HTO_classification)
singlets[["phenotype"]] <- (singlets@meta.data$phenotype) %>%
as.character() %>%
gsub(" ", "_", .) %>%
make.names()
singlets$phenotype <- as.factor(singlets$phenotype)
# Identify clusters to keep
clusters_toKeep <- table(singlets@meta.data$cell_type, singlets@meta.data$phenotype) %>%
t() %>%
apply(., 2, function(x) median(x)) %>%
subset(., is_greater_than(., 100)) %>%
names()
# Aggregate expression data
pbData <- AggregateExpression(singlets, assays = "RNA", group.by = c("cell_type", "HTO_classification"))
pb_counts <- pbData$RNA %>% as.data.frame()
# Perform cluster-wise analysis
clusterwise_pseudobulk_expression_association <- function(cluster, count_data, sample_pheno_data) {
print(paste("evaluating cluster", cluster))
# Get cluster-specific counts
cluster_count_data <- count_data %>% select(starts_with(paste0(cluster)))
col_individuals <- sapply(colnames(cluster_count_data), function(x) strsplit(x, "_")[[1]][2])
temp_indices <- match(col_individuals, sample_pheno_data$HTO_classification)
sample_pheno_data <- sample_pheno_data %>% slice(temp_indices)
# Set up the design matrix
phenotype <- sample_pheno_data %>% mutate(phenotype = as.factor(phenotype)) %>% .$phenotype
# Check if phenotype has at least two levels
if (length(levels(phenotype)) < 1) {
print(paste("Skipping cluster", cluster, "- not enough levels in phenotype"))
return(NULL)
}
mm <- model.matrix(~ phenotype)
colnames(mm) <- levels(phenotype)
row.names(mm) <- colnames(cluster_count_data)
y <- DGEList(counts = cluster_count_data, group = phenotype)
# Filter out low expressed genes
genes_toKeep <- filterByExpr(y, design = mm, min.count=2, min.prop=0.5)
print(paste("no. of genes after filtering = ", sum(genes_toKeep)))
y <- y[genes_toKeep,,keep.lib.sizes=FALSE]
# Normalize for between sample differences
y <- calcNormFactors(y)
# Estimate dispersion
y <- estimateDisp(y, design=mm, robust = TRUE)
# Get the Counts-Per-Million in order to visualize the clustering of the samples in a PCA plot
cpm <- cpm(y)
log2_cpm <- log2(cpm + 0.1*min(cpm[cpm > 0]))
# Calculate log-CPM values with log transformation and prior count stabilization
log_cpm_counts <- log2_cpm
# Estimate variance of the expression of each gene/row
variance_per_gene <- rowVars(log_cpm_counts)
use_features_for_pca <- variance_per_gene %>% sort(., decreasing = TRUE) %>% names(.)
# Pick the top 1000 most variable genes
use_features_for_pca <- use_features_for_pca[1:1000]
log_cpm_counts_t <- t(log_cpm_counts[row.names(log_cpm_counts) %in% use_features_for_pca, ])
# Run PCA on the log-CPM data
res.pca.norm <- prcomp(log_cpm_counts_t, scale = TRUE, rank.=10)
# Create a data frame for visualization
log_cpm_counts_PCs <- data.frame(sample_pheno_data, res.pca.norm$x)
# Print the PCA plot
print(ggplot(log_cpm_counts_PCs, aes(x = PC1, y = PC2, color = phenotype)) +
geom_point(aes(size=0.1)) +
theme_bw() +
ggtitle(paste0("cluster_", cluster)) +
theme(text = element_text(size = 16)))
fit <- glmQLFit(y, design=mm)
qlf <- glmQLFTest(fit, coef=2)
topGenes <- topTags(qlf, n = nrow(y))
diff_res <- data.frame(topGenes$table, cluster = rep(cluster, nrow(topGenes$table)))
return(diff_res)
}
# Apply the function to all clusters to keep
all_diff_res <- lapply(clusters_toKeep, clusterwise_pseudobulk_expression_association, pb_counts, sample_pheno_data)
## [1] "evaluating cluster mTEC3"
## [1] "no. of genes after filtering = 18469"
## [1] "evaluating cluster mTEC2"
## [1] "no. of genes after filtering = 21152"
## [1] "evaluating cluster mature cTEC"
## [1] "no. of genes after filtering = 13917"
## [1] "evaluating cluster mTEC1"
## [1] "no. of genes after filtering = 15574"
## [1] "evaluating cluster PNP"
## [1] "no. of genes after filtering = 14756"
## [1] "evaluating cluster immature cTEC"
## [1] "no. of genes after filtering = 13770"
## [1] "evaluating cluster Early progenitors"
## [1] "no. of genes after filtering = 13322"
## [1] "evaluating cluster Transient amplifying progenitors"
## [1] "no. of genes after filtering = 14192"
all_diff_res %<>% do.call("rbind", .)
##add adjustment for all hypothesis tests - across all genes and all clusters
all_diff_res %<>% mutate(p.adj.global = p.adjust(PValue, "fdr"),
p.adj.local = FDR) %>%
select(-FDR)
mutateddf <- mutate(all_diff_res, sig=ifelse(all_diff_res$p.adj.local<0.01 & (all_diff_res$logFC>1.2 | all_diff_res$logFC<(-1.2)), "Sig", "Not Sig"))
mutateddf <- rownames_to_column(mutateddf, var = "Gene")
mutateddf$cluster <- as.factor(mutateddf$cluster)
input_fg<-subset(mutateddf, sig == "Sig")
# Create volcano plots for each cluster
cluster_mTEC1_bg <- subset(mutateddf, cluster == "mTEC1")
cluster_mTEC1_fg <- subset(input_fg, cluster == "mTEC1")
plot_mTEC1 <- ggplot(cluster_mTEC1_bg, aes(logFC, -log10(p.adj.local))) +
geom_point(color = "grey", alpha = 0.1) +
geom_point(data = cluster_mTEC1_fg, aes(col = sig, alpha = 0.4)) +
ggtitle(paste("mTEC1")) +
theme_bw() +
theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.position = "none") +
xlab("logFC") +
ylab("-log10 q value") +
geom_text_repel(data = head(cluster_mTEC1_fg, 20), aes(label = Gene), size = 3,
box.padding = unit(0.1, "lines"),
point.padding = unit(0.1, "lines"))
cluster_mTEC2_bg <- subset(mutateddf, cluster == "mTEC2")
cluster_mTEC2_fg <- subset(input_fg, cluster == "mTEC2")
plot_mTEC2 <- ggplot(cluster_mTEC2_bg, aes(logFC, -log10(p.adj.local))) +
geom_point(color = "grey", alpha = 0.1) +
geom_point(data = cluster_mTEC2_fg, aes(col = sig, alpha = 0.4)) +
ggtitle(paste("mTEC2")) +
theme_bw() +
theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.position = "none") +
xlab("logFC") +
ylab("-log10 q value") +
geom_text_repel(data = head(cluster_mTEC2_fg, 20), aes(label = Gene), size = 3,
box.padding = unit(0.1, "lines"),
point.padding = unit(0.1, "lines"))
cluster_mTEC3_bg <- subset(mutateddf, cluster == "mTEC3")
cluster_mTEC3_fg <- subset(input_fg, cluster == "mTEC3")
plot_mTEC3 <- ggplot(cluster_mTEC3_bg, aes(logFC, -log10(p.adj.local))) +
geom_point(color = "grey", alpha = 0.1) +
geom_point(data = cluster_mTEC3_fg, aes(col = sig, alpha = 0.4)) +
ggtitle(paste("mTEC3")) +
theme_bw() +
theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.position = "none") +
xlab("logFC") +
ylab("-log10 q value") +
geom_text_repel(data = head(cluster_mTEC3_fg, 20), aes(label = Gene), size = 3,
box.padding = unit(0.1, "lines"),
point.padding = unit(0.1, "lines"))
cluster_PNP_bg <- subset(mutateddf, cluster == "PNP")
cluster_PNP_fg <- subset(input_fg, cluster == "PNP")
plot_PNP <- ggplot(cluster_PNP_bg, aes(logFC, -log10(p.adj.local))) +
geom_point(color = "grey", alpha = 0.1) +
geom_point(data = cluster_PNP_fg, aes(col = sig, alpha = 0.4)) +
ggtitle(paste("PNP")) +
theme_bw() +
theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.position = "none") +
xlab("logFC") +
ylab("-log10 q value") +
geom_text_repel(data = head(cluster_PNP_fg, 20), aes(label = Gene), size = 3,
box.padding = unit(0.1, "lines"),
point.padding = unit(0.1, "lines"))
cluster_mcTEC_bg <- subset(mutateddf, cluster == "mature cTEC")
cluster_mcTEC_fg <- subset(input_fg, cluster == "mature cTEC")
plot_mcTEC <- ggplot(cluster_mcTEC_bg, aes(logFC, -log10(p.adj.local))) +
geom_point(color = "grey", alpha = 0.1) +
geom_point(data = cluster_mcTEC_fg, aes(col = sig, alpha = 0.4)) +
ggtitle(paste("mature cTEC")) +
theme_bw() +
theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.position = "none") +
xlab("logFC") +
ylab("-log10 q value") +
geom_text_repel(data = head(cluster_mcTEC_fg, 20), aes(label = Gene), size = 3,
box.padding = unit(0.1, "lines"),
point.padding = unit(0.1, "lines"))
cluster_imcTEC_bg <- subset(mutateddf, cluster == "immature cTEC")
cluster_imcTEC_fg <- subset(input_fg, cluster == "immature cTEC")
plot_imcTEC <- ggplot(cluster_imcTEC_bg, aes(logFC, -log10(p.adj.local))) +
geom_point(color = "grey", alpha = 0.1) +
geom_point(data = cluster_imcTEC_fg, aes(col = sig, alpha = 0.4)) +
ggtitle(paste("immature cTEC")) +
theme_bw() +
theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.position = "none") +
xlab("logFC") +
ylab("-log10 q value") +
geom_text_repel(data = head(cluster_imcTEC_fg, 20), aes(label = Gene), size = 3,
box.padding = unit(0.1, "lines"),
point.padding = unit(0.1, "lines"))
cluster_EP_bg <- subset(mutateddf, cluster == "Early progenitors")
cluster_EP_fg <- subset(input_fg, cluster == "Early progenitors")
plot_EP <- ggplot(cluster_EP_bg, aes(logFC, -log10(p.adj.local))) +
geom_point(color = "grey", alpha = 0.1) +
geom_point(data = cluster_EP_fg, aes(col = sig, alpha = 0.4)) +
ggtitle(paste("Early progenitors")) +
theme_bw() +
theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.position = "none") +
xlab("logFC") +
ylab("-log10 q value") +
geom_text_repel(data = head(cluster_EP_fg, 20), aes(label = Gene), size = 3,
box.padding = unit(0.1, "lines"),
point.padding = unit(0.1, "lines"))
cluster_TAP_bg <- subset(mutateddf, cluster == "Transient amplifying progenitors")
cluster_TAP_fg <- subset(input_fg, cluster == "Transient amplifying progenitors")
plot_TAP <- ggplot(cluster_TAP_bg, aes(logFC, -log10(p.adj.local))) +
geom_point(color = "grey", alpha = 0.1) +
geom_point(data = cluster_TAP_fg, aes(col = sig, alpha = 0.4)) +
ggtitle(paste("Transient amplifying progenitors")) +
theme_bw() +
theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.position = "none") +
xlab("logFC") +
ylab("-log10 q value") +
geom_text_repel(data = head(cluster_TAP_fg, 20), aes(label = Gene), size = 3,
box.padding = unit(0.1, "lines"),
point.padding = unit(0.1, "lines"))
plot_mTEC1|plot_mTEC2|plot_mTEC3|plot_PNP
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_mcTEC|plot_imcTEC|plot_TAP|plot_EP
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
DefaultAssay(singlets) <- "SCT"
p5 <- FeaturePlot_scCustom(singlets, features=c("Ackr4", "Dll4", "Psmb11", "Ccl21a", "Aire", "Fezf2"), reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, label=FALSE)& NoAxes() & NoLegend()
p5
wt1_data <- subset(x = singlets, subset = HTO_classification == "WT1")
wt2_data <- subset(x = singlets, subset = HTO_classification == "WT2")
wt3_data <- subset(x = singlets, subset = HTO_classification == "WT3")
ko1_data <- subset(x = singlets, subset = HTO_classification == "KO1")
ko2_data <- subset(x = singlets, subset = HTO_classification == "KO2")
ko3_data <- subset(x = singlets, subset = HTO_classification == "KO3")
gsva_wt1 <- analyse_sc_clusters(wt1_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_wt2 <- analyse_sc_clusters(wt2_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_wt3 <- analyse_sc_clusters(wt3_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_ko1 <- analyse_sc_clusters(ko1_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_ko2 <- analyse_sc_clusters(ko2_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_ko3 <- analyse_sc_clusters(ko3_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
pathway_wt1 <- pathways(gsva_wt1)
pathway_wt2 <- pathways(gsva_wt2)
pathway_wt3 <- pathways(gsva_wt3)
pathway_ko1 <- pathways(gsva_ko1)
pathway_ko2 <- pathways(gsva_ko2)
pathway_ko3 <- pathways(gsva_ko3)
colnames(pathway_wt1) <- gsub("\\.Seurat", "", colnames(pathway_wt1))
colnames(pathway_wt2) <- gsub("\\.Seurat", "", colnames(pathway_wt2))
colnames(pathway_wt3) <- gsub("\\.Seurat", "", colnames(pathway_wt3))
colnames(pathway_ko1) <- gsub("\\.Seurat", "", colnames(pathway_ko1))
colnames(pathway_ko2) <- gsub("\\.Seurat", "", colnames(pathway_ko2))
colnames(pathway_ko3) <- gsub("\\.Seurat", "", colnames(pathway_ko3))
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:gdata':
##
## starts_with
## The following object is masked from 'package:magrittr':
##
## extract
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
long_wt1 <- gather(pathway_wt1, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_wt2 <- gather(pathway_wt2, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_wt3 <- gather(pathway_wt3, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_ko1 <- gather(pathway_ko1, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_ko2 <- gather(pathway_ko2, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_ko3 <- gather(pathway_ko3, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_wt1$HTO <- 'WT1'
long_wt2$HTO <- 'WT2'
long_wt3$HTO <- 'WT3'
long_ko1$HTO <- 'KO1'
long_ko2$HTO <- 'KO2'
long_ko3$HTO <- 'KO3'
pathway <- rbind(long_wt1, long_wt2, long_wt3, long_ko1, long_ko2, long_ko3)
pathway_wide <- spread(pathway, HTO, score)
pathway_wide<- pathway_wide[complete.cases(pathway_wide), ]
pathway_wide <- subset(pathway_wide, pathway_wide$cluster != "Residual")
pathway_long <- gather(pathway_wide, HTO, score, KO1:WT3, factor_key=TRUE)
hto_phenotype_map <- data.frame(
HTO = c("WT1", "WT2", "WT3", "KO1", "KO2", "KO3"),
phenotype = c("WT", "WT", "WT", "KO", "KO", "KO")
)
df <- pathway_long %>%
left_join(hto_phenotype_map, by = "HTO")
long_wt <- gather(pathway_wt, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
calculate_p_values <- function(df) {
p_values_df <- df %>%
group_by(Name, cluster) %>%
summarize(p_value = t.test(score[phenotype == "WT"], score[phenotype == "KO"])$p.value) %>%
ungroup()
return(p_values_df)
}
p_values <- calculate_p_values(df)
## `summarise()` has grouped output by 'Name'. You can override using the
## `.groups` argument.
print(p_values)
## # A tibble: 17,469 Ă— 3
## Name cluster p_value
## <chr> <fct> <dbl>
## 1 5-Phosphoribose 1-diphosphate biosynthesis Early_… 0.0414
## 2 5-Phosphoribose 1-diphosphate biosynthesis immatu… 0.268
## 3 5-Phosphoribose 1-diphosphate biosynthesis mature… 0.720
## 4 5-Phosphoribose 1-diphosphate biosynthesis mTEC1 0.556
## 5 5-Phosphoribose 1-diphosphate biosynthesis mTEC2 0.535
## 6 5-Phosphoribose 1-diphosphate biosynthesis mTEC3 0.0352
## 7 5-Phosphoribose 1-diphosphate biosynthesis PNP 0.372
## 8 5-Phosphoribose 1-diphosphate biosynthesis prolif… 0.770
## 9 5-Phosphoribose 1-diphosphate biosynthesis Transi… 0.182
## 10 A tetrasaccharide linker sequence is required for GAG synthe… Early_… 0.767
## # ℹ 17,459 more rows
library(dplyr)
mean_df <- df %>%
group_by(Name, cluster, phenotype) %>%
summarize(mean_score = mean(score, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = phenotype, values_from = mean_score)
combined_df <- dplyr::left_join(mean_df, p_values, by = c("Name", "cluster"))
sig_df <- subset(combined_df, p_value< 0.01)
sig_df$diff <- sig_df$KO - sig_df$WT
# Assuming your data frame is called 'df'
# and the columns are named 'KO', 'WT', 'difference', and 'cluster'
result_list <- list() # Initialize an empty list to store results
unique_clusters <- unique(sig_df$cluster) # Get unique cluster values
for (cluster_val in unique_clusters) {
cluster_df <- sig_df[sig_df$cluster == cluster_val, ] # Subset data for the current cluster
if (nrow(cluster_df) > 0) { # Check if there are any rows for the current cluster
cluster_df_ordered <- cluster_df[order(cluster_df$diff), ] # Order by difference
top3 <- tail(cluster_df_ordered, 3)
bottom3 <- head(cluster_df_ordered, 3)
result_list[[cluster_val]] <- rbind(bottom3, top3) # Store results in the list, including all columns
}else{
result_list[[cluster_val]] <- NULL # if no rows for the cluster, store NULL.
}
}
# result_list now contains a list of data frames,
# where each element is the top 3 and bottom 3 rows
# for a specific cluster.
# Example of accessing the result for cluster "A":
# result_list$A
#If you want to combine everything into one dataframe, but keep track of the cluster.
combined_df <- do.call(rbind, lapply(names(result_list), function(name) {
if(!is.null(result_list[[name]])){
cbind(result_list[[name]], cluster_original = name) # Retains all original columns
}
}))
plot <- ggplot(
combined_df %>%
pivot_longer(cols = c("WT", "KO"), names_to = "phenotype", values_to = "mean_score") %>%
group_by(cluster_original) %>%
mutate(
Name = factor(
Name,
levels = unique((.) %>% filter(phenotype == "KO") %>% arrange(desc(mean_score)) %>% pull(Name))
)
),
aes(y = Name, x = mean_score, fill = phenotype)
) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ cluster_original, scales = "free_y") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "Mean Pathway Scores by Cluster and Phenotype",
x = "Mean Score",
y = "Name",
fill = "Phenotype"
)
plot
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.3.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Toronto
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_1.3.1 gdata_3.0.1 ReactomeGSA_1.18.0 decoupleR_2.10.0
## [5] SCpubr_2.0.2 openxlsx_4.2.8 HGNChelper_0.8.15 ggrepel_0.9.6
## [9] tibble_3.2.1 matrixStats_1.5.0 edgeR_4.2.2 limma_3.60.6
## [13] presto_1.0.0 data.table_1.17.0 Rcpp_1.0.14 scCustomize_3.0.1
## [17] magrittr_2.0.3 lme4_1.1-36 Matrix_1.7-3 glmGamPoi_1.16.0
## [21] ggplot2_3.5.1 patchwork_1.3.0 sctransform_0.4.1 Seurat_5.2.1
## [25] SeuratObject_5.0.2 sp_2.2-0 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] fs_1.6.5 spatstat.sparse_3.1-0
## [3] bitops_1.0-9 lubridate_1.9.4
## [5] httr_1.4.7 RColorBrewer_1.1-3
## [7] tools_4.4.0 utf8_1.2.4
## [9] R6_2.6.1 lazyeval_0.2.2
## [11] uwot_0.2.3 withr_3.0.2
## [13] prettyunits_1.2.0 gridExtra_2.3
## [15] progressr_0.15.1 cli_3.6.4
## [17] Biobase_2.64.0 spatstat.explore_3.4-2
## [19] fastDummies_1.7.5 labeling_0.4.3
## [21] sass_0.4.9 prismatic_1.1.2
## [23] spatstat.data_3.1-6 ggridges_0.5.6
## [25] pbapply_1.7-2 yulab.utils_0.2.0
## [27] R.utils_2.13.0 parallelly_1.42.0
## [29] rstudioapi_0.17.1 generics_0.1.3
## [31] gridGraphics_0.5-1 shape_1.4.6.1
## [33] gtools_3.9.5 ica_1.0-3
## [35] spatstat.random_3.3-3 zip_2.3.2
## [37] ggbeeswarm_0.7.2 S4Vectors_0.42.1
## [39] abind_1.4-8 R.methodsS3_1.8.2
## [41] lifecycle_1.0.4 yaml_2.3.10
## [43] snakecase_0.11.1 SummarizedExperiment_1.34.0
## [45] gplots_3.2.0 SparseArray_1.4.8
## [47] Rtsne_0.17 paletteer_1.6.0
## [49] grid_4.4.0 promises_1.3.2
## [51] crayon_1.5.3 miniUI_0.1.1.1
## [53] lattice_0.22-6 cowplot_1.1.3
## [55] pillar_1.10.1 knitr_1.50
## [57] GenomicRanges_1.56.2 boot_1.3-31
## [59] future.apply_1.11.3 codetools_0.2-20
## [61] glue_1.8.0 spatstat.univar_3.1-2
## [63] vctrs_0.6.5 png_0.1-8
## [65] spam_2.11-1 Rdpack_2.6.3
## [67] gtable_0.3.6 assertthat_0.2.1
## [69] rematch2_2.1.2 cachem_1.1.0
## [71] xfun_0.51 rbibutils_2.3
## [73] S4Arrays_1.4.1 mime_0.13
## [75] reformulas_0.4.0 survival_3.8-3
## [77] statmod_1.5.0 fitdistrplus_1.2-2
## [79] ROCR_1.0-11 nlme_3.1-167
## [81] progress_1.2.3 RcppAnnoy_0.0.22
## [83] GenomeInfoDb_1.40.1 bslib_0.9.0
## [85] irlba_2.3.5.1 vipor_0.4.7
## [87] KernSmooth_2.23-26 splitstackshape_1.4.8
## [89] colorspace_2.1-1 BiocGenerics_0.50.0
## [91] ggrastr_1.0.2 tidyselect_1.2.1
## [93] compiler_4.4.0 curl_6.2.1
## [95] DelayedArray_0.30.1 plotly_4.10.4
## [97] scales_1.3.0 caTools_1.18.3
## [99] lmtest_0.9-40 stringr_1.5.1
## [101] digest_0.6.37 goftest_1.2-3
## [103] spatstat.utils_3.1-3 minqa_1.2.8
## [105] rmarkdown_2.29 XVector_0.44.0
## [107] htmltools_0.5.8.1 pkgconfig_2.0.3
## [109] sparseMatrixStats_1.16.0 MatrixGenerics_1.16.0
## [111] fastmap_1.2.0 rlang_1.1.5
## [113] GlobalOptions_0.1.2 htmlwidgets_1.6.4
## [115] UCSC.utils_1.0.0 shiny_1.10.0
## [117] DelayedMatrixStats_1.26.0 farver_2.1.2
## [119] jquerylib_0.1.4 zoo_1.8-13
## [121] jsonlite_1.9.1 BiocParallel_1.38.0
## [123] R.oo_1.27.0 GenomeInfoDbData_1.2.12
## [125] ggplotify_0.1.2 dotCall64_1.2
## [127] munsell_0.5.1 viridis_0.6.5
## [129] reticulate_1.41.0.1 stringi_1.8.4
## [131] zlibbioc_1.50.0 MASS_7.3-65
## [133] plyr_1.8.9 parallel_4.4.0
## [135] listenv_0.9.1 forcats_1.0.0
## [137] deldir_2.0-4 splines_4.4.0
## [139] tensor_1.5 hms_1.1.3
## [141] circlize_0.4.16 locfit_1.5-9.12
## [143] igraph_2.1.4 spatstat.geom_3.3-6
## [145] RcppHNSW_0.6.0 reshape2_1.4.4
## [147] stats4_4.4.0 evaluate_1.0.3
## [149] ggprism_1.0.5 nloptr_2.2.1
## [151] httpuv_1.6.15 RANN_2.6.2
## [153] purrr_1.0.4 polyclip_1.10-7
## [155] future_1.34.0 scattermore_1.2
## [157] janitor_2.2.1 xtable_1.8-4
## [159] RSpectra_0.16-2 later_1.4.1
## [161] viridisLite_0.4.2 beeswarm_0.4.0
## [163] IRanges_2.38.1 cluster_2.1.8.1
## [165] timechange_0.3.0 globals_0.16.3